Title¶
Developed by (Name: Rishitha Nimma)
In [1]:
# Import the required libraries and load the dataset in an easy-to-read tabular form
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
# Read the Excel workbook (path is relative to the notebook's working directory) into a DataFrame.
df = pd.read_excel('cardekho_dataset.xlsx')
# Preview the first five rows to confirm the columns loaded as expected.
df.head()
Out[1]:
| brand | model | vehicle_age | km_driven | fuel_type | transmission_type | mileage | engine | seats | selling_price | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Maruti | Alto | 9 | 120000 | Petrol | Manual | 19.70 | 796 | 5 | 120000 |
| 1 | Hyundai | Grand | 5 | 20000 | Petrol | Manual | 18.90 | 1197 | 5 | 550000 |
| 2 | Hyundai | i20 | 11 | 60000 | Petrol | Manual | 17.00 | 1197 | 5 | 215000 |
| 3 | Maruti | Alto | 9 | 37000 | Petrol | Manual | 20.92 | 998 | 5 | 226000 |
| 4 | Ford | Ecosport | 6 | 30000 | Diesel | Manual | 22.77 | 1498 | 5 | 570000 |
Import libraries for EDA¶
In [2]:
# Import libraries for EDA
# Note: these libraries were already imported in the first cell; they are repeated
# here only so that this section lists everything the EDA relies on.
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
Review Data¶
In [3]:
# Check the data dimensions: .shape is a (rows, columns) tuple.
df.shape  # last expression of the cell, so the tuple is shown as the cell output
Out[3]:
(15411, 10)
In [4]:
# Check column types with .info() (the next cell shows the same types via .dtypes).
df.info()  # prints the index range, per-column non-null counts and dtypes, and memory usage
<class 'pandas.core.frame.DataFrame'> RangeIndex: 15411 entries, 0 to 15410 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 brand 15411 non-null object 1 model 15411 non-null object 2 vehicle_age 15411 non-null int64 3 km_driven 15411 non-null int64 4 fuel_type 15411 non-null object 5 transmission_type 15411 non-null object 6 mileage 15411 non-null float64 7 engine 15411 non-null int64 8 seats 15411 non-null int64 9 selling_price 15411 non-null int64 dtypes: float64(1), int64(5), object(4) memory usage: 1.2+ MB
In [5]:
df.dtypes  # one dtype per column, returned as a Series indexed by column name
Out[5]:
brand object model object vehicle_age int64 km_driven int64 fuel_type object transmission_type object mileage float64 engine int64 seats int64 selling_price int64 dtype: object
In [6]:
# For the quantitative variables, tabulate count, mean, standard deviation,
# minimum, quartiles (25% / 50% / 75%) and maximum.
df.describe()  # by default only numeric columns are summarised
Out[6]:
| vehicle_age | km_driven | mileage | engine | seats | selling_price | |
|---|---|---|---|---|---|---|
| count | 15411.000000 | 1.541100e+04 | 15411.000000 | 15411.000000 | 15411.000000 | 1.541100e+04 |
| mean | 6.036338 | 5.561648e+04 | 19.701151 | 1486.057751 | 5.325482 | 7.749711e+05 |
| std | 3.013291 | 5.161855e+04 | 4.171265 | 521.106696 | 0.807628 | 8.941284e+05 |
| min | 0.000000 | 1.000000e+02 | 4.000000 | 793.000000 | 0.000000 | 4.000000e+04 |
| 25% | 4.000000 | 3.000000e+04 | 17.000000 | 1197.000000 | 5.000000 | 3.850000e+05 |
| 50% | 6.000000 | 5.000000e+04 | 19.670000 | 1248.000000 | 5.000000 | 5.560000e+05 |
| 75% | 8.000000 | 7.000000e+04 | 22.700000 | 1582.000000 | 5.000000 | 8.250000e+05 |
| max | 29.000000 | 3.800000e+06 | 33.540000 | 6592.000000 | 9.000000 | 3.950000e+07 |
Generate Sub-dataset¶
In [7]:
# 1. Build one sub-dataset per research question by dropping the variables
#    that question does not need.
# Work on a copy so the originally loaded frame `df` is left untouched.
data_sub = df.copy()

# Columns that are irrelevant to each question
mileage_unused = ['brand', 'model', 'selling_price', 'seats']
transmission_unused = ['vehicle_age', 'km_driven', 'seats', 'brand', 'model']
fuel_performance_unused = ['brand', 'model', 'vehicle_age', 'km_driven', 'transmission_type', 'seats', 'engine']

# Question 1: factors contributing to high mileage
mileage_dataset = data_sub.drop(mileage_unused, axis=1)
# Question 2: vehicle characteristics affecting the price of automatic vs. manual cars
transmission_dataset = data_sub.drop(transmission_unused, axis=1)
# Question 3: influence of fuel type on performance (mileage) and price
fuel_performance_dataset = data_sub.drop(fuel_performance_unused, axis=1)

# Show the first rows of every sub-dataset to confirm the right columns remain
(
    mileage_dataset.head(),
    transmission_dataset.head(),
    fuel_performance_dataset.head(),
)
Out[7]:
( vehicle_age km_driven fuel_type transmission_type mileage engine 0 9 120000 Petrol Manual 19.70 796 1 5 20000 Petrol Manual 18.90 1197 2 11 60000 Petrol Manual 17.00 1197 3 9 37000 Petrol Manual 20.92 998 4 6 30000 Diesel Manual 22.77 1498, fuel_type transmission_type mileage engine selling_price 0 Petrol Manual 19.70 796 120000 1 Petrol Manual 18.90 1197 550000 2 Petrol Manual 17.00 1197 215000 3 Petrol Manual 20.92 998 226000 4 Diesel Manual 22.77 1498 570000, fuel_type mileage selling_price 0 Petrol 19.70 120000 1 Petrol 18.90 550000 2 Petrol 17.00 215000 3 Petrol 20.92 226000 4 Diesel 22.77 570000)
In [8]:
# Give the columns clearer names in every sub-dataset.
# A single mapping is shared by all three frames: DataFrame.rename ignores
# keys that are not present, so each frame only renames the columns it has.
column_renames = {
    'vehicle_age': 'age_of_vehicle',
    'km_driven': 'kilometers_driven',
    'fuel_type': 'type_of_fuel',
    'transmission_type': 'gearbox_type',
    'mileage': 'car_mileage',
    'engine': 'engine_capacity',
    'selling_price': 'price',
}

# Question 1: factors contributing to high mileage
mileage_dataset = mileage_dataset.rename(columns=column_renames)
# Question 2: vehicle characteristics affecting the price of automatic vs. manual cars
transmission_dataset = transmission_dataset.rename(columns=column_renames)
# Question 3: influence of fuel type on performance (mileage) and price
fuel_performance_dataset = fuel_performance_dataset.rename(columns=column_renames)

# Show the first rows of every renamed sub-dataset
(
    mileage_dataset.head(),
    transmission_dataset.head(),
    fuel_performance_dataset.head(),
)
Out[8]:
( age_of_vehicle kilometers_driven type_of_fuel gearbox_type car_mileage \
0 9 120000 Petrol Manual 19.70
1 5 20000 Petrol Manual 18.90
2 11 60000 Petrol Manual 17.00
3 9 37000 Petrol Manual 20.92
4 6 30000 Diesel Manual 22.77
engine_capacity
0 796
1 1197
2 1197
3 998
4 1498 ,
type_of_fuel gearbox_type car_mileage engine_capacity price
0 Petrol Manual 19.70 796 120000
1 Petrol Manual 18.90 1197 550000
2 Petrol Manual 17.00 1197 215000
3 Petrol Manual 20.92 998 226000
4 Diesel Manual 22.77 1498 570000,
type_of_fuel car_mileage price
0 Petrol 19.70 120000
1 Petrol 18.90 550000
2 Petrol 17.00 215000
3 Petrol 20.92 226000
4 Diesel 22.77 570000)
In [9]:
# Count missing values per column, first in the full dataset and then in
# each of the three sub-datasets.
missing_values_main = df.isna().sum()
missing_values_mileage = mileage_dataset.isna().sum()
missing_values_transmission = transmission_dataset.isna().sum()
missing_values_fuel_performance = fuel_performance_dataset.isna().sum()

(
    missing_values_main,
    missing_values_mileage,
    missing_values_transmission,
    missing_values_fuel_performance,
)
Out[9]:
(brand 0 model 0 vehicle_age 0 km_driven 0 fuel_type 0 transmission_type 0 mileage 0 engine 0 seats 0 selling_price 0 dtype: int64, age_of_vehicle 0 kilometers_driven 0 type_of_fuel 0 gearbox_type 0 car_mileage 0 engine_capacity 0 dtype: int64, type_of_fuel 0 gearbox_type 0 car_mileage 0 engine_capacity 0 price 0 dtype: int64, type_of_fuel 0 car_mileage 0 price 0 dtype: int64)
In [10]:
# Count fully duplicated rows: a row is flagged when an identical row occurs
# earlier in the same frame.
duplicates_main = df.duplicated().sum()  # full dataset

# The same check for every sub-dataset
duplicates_mileage = mileage_dataset.duplicated().sum()
duplicates_transmission = transmission_dataset.duplicated().sum()
duplicates_fuel_performance = fuel_performance_dataset.duplicated().sum()

(
    duplicates_main,
    duplicates_mileage,
    duplicates_transmission,
    duplicates_fuel_performance,
)
Out[10]:
(189, 1919, 5858, 6391)
Checking Outliers¶
In [11]:
# check if quantitative variables have outliers.
# Check for outliers using the IQR method for each numeric variable
def detect_outliers(dataset, numeric_columns):
    """Count IQR-rule outliers in each requested column.

    A value counts as an outlier when it lies strictly below
    ``Q1 - 1.5 * IQR`` or strictly above ``Q3 + 1.5 * IQR``, where the
    quartiles are computed from that column of ``dataset``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing the columns to inspect; it is not modified.
    numeric_columns : iterable of str
        Names of the columns to check.

    Returns
    -------
    dict
        Maps each column name to its number of outliers (built-in ``int``).
    """
    outlier_report = {}
    for column in numeric_columns:
        values = dataset[column]  # look the column up once per iteration
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        is_outlier = (values < lower_bound) | (values > upper_bound)
        # Store a plain int so the report prints as {'col': 154} instead of
        # a NumPy scalar repr.
        outlier_report[column] = int(is_outlier.sum())
    return outlier_report
# Numeric columns to screen for outliers in each sub-dataset
numeric_columns_mileage = [
    'age_of_vehicle',
    'kilometers_driven',
    'car_mileage',
    'engine_capacity',
]
numeric_columns_transmission = ['car_mileage', 'engine_capacity', 'price']
numeric_columns_fuel_performance = ['car_mileage', 'price']

# Outlier counts per column for the mileage, transmission and fuel-performance datasets
outliers_mileage = detect_outliers(mileage_dataset, numeric_columns_mileage)
outliers_transmission = detect_outliers(transmission_dataset, numeric_columns_transmission)
outliers_fuel_performance = detect_outliers(fuel_performance_dataset, numeric_columns_fuel_performance)

(outliers_mileage, outliers_transmission, outliers_fuel_performance)
Out[11]:
({'age_of_vehicle': 154,
'kilometers_driven': 466,
'car_mileage': 88,
'engine_capacity': 2130},
{'car_mileage': 88, 'engine_capacity': 2130, 'price': 1386},
{'car_mileage': 88, 'price': 1386})
In [12]:
# Function to remove outliers based on IQR method
def remove_outliers(dataset, numeric_columns):
    """Return the rows of ``dataset`` that pass the IQR rule for every column.

    Columns are processed in the order given. For each one, rows whose value
    is outside ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` (bounds inclusive) are
    dropped. The filtering is sequential: quartiles for a later column are
    computed on the rows that survived the earlier columns, so the result can
    depend on the column order. The input frame itself is not modified.
    """
    filtered = dataset
    for name in numeric_columns:
        first_quartile = filtered[name].quantile(0.25)
        third_quartile = filtered[name].quantile(0.75)
        spread = third_quartile - first_quartile
        floor = first_quartile - 1.5 * spread
        ceiling = third_quartile + 1.5 * spread
        within_bounds = (filtered[name] >= floor) & (filtered[name] <= ceiling)
        filtered = filtered[within_bounds]
    return filtered
# Apply the IQR filter to every sub-dataset, keeping the unfiltered frames intact
mileage_dataset_clean = remove_outliers(
    mileage_dataset, numeric_columns_mileage
)
transmission_dataset_clean = remove_outliers(
    transmission_dataset, numeric_columns_transmission
)
fuel_performance_dataset_clean = remove_outliers(
    fuel_performance_dataset, numeric_columns_fuel_performance
)

# Preview the cleaned datasets
(
    mileage_dataset_clean.head(),
    transmission_dataset_clean.head(),
    fuel_performance_dataset_clean.head(),
)
Out[12]:
( age_of_vehicle kilometers_driven type_of_fuel gearbox_type car_mileage \
0 9 120000 Petrol Manual 19.70
1 5 20000 Petrol Manual 18.90
2 11 60000 Petrol Manual 17.00
3 9 37000 Petrol Manual 20.92
4 6 30000 Diesel Manual 22.77
engine_capacity
0 796
1 1197
2 1197
3 998
4 1498 ,
type_of_fuel gearbox_type car_mileage engine_capacity price
0 Petrol Manual 19.70 796 120000
1 Petrol Manual 18.90 1197 550000
2 Petrol Manual 17.00 1197 215000
3 Petrol Manual 20.92 998 226000
4 Diesel Manual 22.77 1498 570000,
type_of_fuel car_mileage price
0 Petrol 19.70 120000
1 Petrol 18.90 550000
2 Petrol 17.00 215000
3 Petrol 20.92 226000
4 Diesel 22.77 570000)
Generating Plot(s)¶
In [13]:
# Box plots of mileage against the candidate factors for high mileage.
# matplotlib.pyplot (plt) and seaborn (sns) are already imported in the first cell.
sns.set(style="whitegrid")

fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# (x column, panel title, x-axis label) for each panel, in row-major order.
# The second panel plots gearbox_type, so it is labelled as transmission type
# (it was previously mislabelled "Kilometers Driven").
panels = [
    ('age_of_vehicle', 'Mileage by Age of Vehicle', 'Age of Vehicle'),
    ('gearbox_type', 'Mileage by Transmission Type', 'Transmission Type'),
    ('type_of_fuel', 'Mileage by Fuel Type', 'Fuel Type'),
    ('engine_capacity', 'Mileage by Engine Capacity', 'Engine Capacity (cc)'),
]
for ax, (column, title, xlabel) in zip(axes.flat, panels):
    sns.boxplot(x=column, y='car_mileage', data=mileage_dataset_clean, ax=ax)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Mileage (kmpl)')

fig.tight_layout()
plt.show()
The box plots above provide insights into the factors contributing to high mileage.
- Older vehicles tend to have slightly lower mileage, but the variation is not extreme.
- Manual transmission vehicles tend to offer higher mileage than automatic transmission vehicles; the automatic group shows a tighter distribution.
- Diesel vehicles generally offer higher mileage compared to petrol vehicles, as reflected in the higher median mileage for diesel cars.
- Smaller engine capacities tend to have higher mileage, while larger engines show lower mileage values, which is expected due to higher fuel consumption.
In [14]:
import matplotlib.pyplot as plt
import seaborn as sns

# Seaborn theme for the figures below
sns.set(style="whitegrid")

# Distribution of kilometers driven (histogram with KDE overlay)
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(mileage_dataset_clean['kilometers_driven'], kde=True, color='blue', bins=20, ax=ax)
ax.set_title('Distribution of Kilometers Driven')
ax.set_xlabel('Kilometers Driven')
ax.set_ylabel('Frequency')
plt.show()

# Distribution of engine capacity (histogram with KDE overlay)
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(mileage_dataset_clean['engine_capacity'], kde=True, color='green', bins=20, ax=ax)
ax.set_title('Distribution of Engine Capacity')
ax.set_xlabel('Engine Capacity (cc)')
ax.set_ylabel('Frequency')
plt.show()
The histograms above provide a clear view of the distribution of two key variables in the dataset:
- Kilometers driven: most vehicles have been driven a moderate number of kilometers, with fewer cars at both the lower and higher ends. The distribution is slightly right-skewed, i.e. it has a longer tail of cars with high kilometers driven.
- Engine capacity: the majority of the vehicles have engine capacities clustered around certain common values (such as 1000cc to 1500cc), with fewer vehicles having engines that are either much smaller or much larger.
In [15]:
# Scatter plot of engine capacity (x) against car mileage (y)
plt.figure(figsize=(10, 6))
sns.scatterplot(x='engine_capacity', y='car_mileage', data=transmission_dataset_clean)
# Title and x label previously misspelled "Capacity" as "Cpacity"
plt.title('Engine Capacity vs. Car Mileage')
plt.xlabel('Engine Capacity (cc)')
plt.ylabel('Car Mileage (kmpl)')
plt.show()
The scatter plot above shows the relationship between engine capacity and car mileage. Here are some key insights:
- As the engine capacity increases, there is a noticeable decline in car mileage, which is expected as larger engines typically consume more fuel.
In [16]:
# Hexbin plot of price (x) against car mileage (y): a density-style alternative
# to a scatter plot, where colour encodes how many cars fall in each hexagon.
fig, ax = plt.subplots(figsize=(10, 6))
hexagons = ax.hexbin(
    fuel_performance_dataset_clean['price'],
    fuel_performance_dataset_clean['car_mileage'],
    gridsize=30,
    cmap='Blues',
)
ax.set_title('Hexbin Plot of Price vs. Car Mileage')
ax.set_xlabel('Price')
ax.set_ylabel('Car Mileage (kmpl)')
fig.colorbar(hexagons, ax=ax, label='Density')
plt.show()
- The darker hexagons represent areas where many vehicles cluster around a particular price and mileage. These clusters indicate typical pricing ranges for vehicles with certain mileage.
- There is a slight tendency for higher-priced vehicles to have lower mileage, although the relationship isn't overly strong, suggesting other factors likely influence price beyond just mileage.
EDA¶
Question 1: What factors contribute to high mileage in used cars ?
In [17]:
# Descriptive statistics of the numeric columns after outlier removal (Question 1 dataset)
mileage_dataset_clean.describe()
Out[17]:
| age_of_vehicle | kilometers_driven | car_mileage | engine_capacity | |
|---|---|---|---|---|
| count | 12869.000000 | 12869.000000 | 12869.000000 | 12869.000000 |
| mean | 5.789339 | 48798.379750 | 20.596419 | 1319.224959 |
| std | 2.810510 | 27197.863832 | 3.521140 | 284.008205 |
| min | 0.000000 | 100.000000 | 10.000000 | 793.000000 |
| 25% | 4.000000 | 27700.000000 | 18.100000 | 1197.000000 |
| 50% | 5.000000 | 47592.000000 | 20.360000 | 1248.000000 |
| 75% | 8.000000 | 67000.000000 | 23.000000 | 1497.000000 |
| max | 14.000000 | 130000.000000 | 30.480000 | 2157.000000 |
In [18]:
# Univariate analysis for Question 1: text summaries of every variable.
# matplotlib.pyplot (plt) and seaborn (sns) are already imported in the first cell.
sns.set(style="whitegrid")

# Summary statistics for Age of Vehicle
print("Summary Statistics for Age of Vehicle:")
print(mileage_dataset_clean['age_of_vehicle'].describe())

# Summary statistics for Kilometers Driven
print("\nSummary Statistics for Kilometers Driven:")
print(mileage_dataset_clean['kilometers_driven'].describe())

# Frequency count for Fuel Type
print("\nFuel Type Distribution:")
print(mileage_dataset_clean['type_of_fuel'].value_counts())

# Summary statistics for Engine Capacity
print("\nSummary Statistics for Engine Capacity:")
print(mileage_dataset_clean['engine_capacity'].describe())

# Summary statistics for Mileage
print("\nSummary Statistics for Mileage:")
print(mileage_dataset_clean['car_mileage'].describe())

# Frequency count for Transmission Type
print("\nTransmission Type Distribution:")
print(mileage_dataset_clean['gearbox_type'].value_counts())

# The trailing plt.figure(...) call was dropped: nothing was drawn on it, so it
# only produced an empty "Figure ... with 0 Axes" output. The plots are created
# in the following cells.
Summary Statistics for Age of Vehicle: count 12869.000000 mean 5.789339 std 2.810510 min 0.000000 25% 4.000000 50% 5.000000 75% 8.000000 max 14.000000 Name: age_of_vehicle, dtype: float64 Summary Statistics for Kilometers Driven: count 12869.000000 mean 48798.379750 std 27197.863832 min 100.000000 25% 27700.000000 50% 47592.000000 75% 67000.000000 max 130000.000000 Name: kilometers_driven, dtype: float64 Fuel Type Distribution: Petrol 7331 Diesel 5269 CNG 227 LPG 42 Name: type_of_fuel, dtype: int64 Summary Statistics for Engine Capacity: count 12869.000000 mean 1319.224959 std 284.008205 min 793.000000 25% 1197.000000 50% 1248.000000 75% 1497.000000 max 2157.000000 Name: engine_capacity, dtype: float64 Summary Statistics for Mileage: count 12869.000000 mean 20.596419 std 3.521140 min 10.000000 25% 18.100000 50% 20.360000 75% 23.000000 max 30.480000 Name: car_mileage, dtype: float64 Transmission Type Distribution: Manual 10437 Automatic 2432 Name: gearbox_type, dtype: int64
Out[18]:
<Figure size 1600x1200 with 0 Axes>
<Figure size 1600x1200 with 0 Axes>
In [19]:
# 2x2 grid of univariate distributions for the Question 1 variables
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
ax_age, ax_km, ax_fuel, ax_engine = axes.flat

# Age of vehicle (histogram + KDE)
sns.histplot(mileage_dataset_clean['age_of_vehicle'], kde=True, bins=20, color='blue', ax=ax_age)
ax_age.set_title('Distribution of Age of Vehicle')
ax_age.set_xlabel('Age of Vehicle (years)')
ax_age.set_ylabel('Frequency')

# Kilometers driven (histogram + KDE)
sns.histplot(mileage_dataset_clean['kilometers_driven'], kde=True, bins=20, color='green', ax=ax_km)
ax_km.set_title('Distribution of Kilometers Driven')
ax_km.set_xlabel('Kilometers Driven')
ax_km.set_ylabel('Frequency')

# Fuel type (bar chart of counts)
sns.countplot(x='type_of_fuel', data=mileage_dataset_clean, palette='Set2', ax=ax_fuel)
ax_fuel.set_title('Fuel Type Distribution')
ax_fuel.set_xlabel('Fuel Type')
ax_fuel.set_ylabel('Count')

# Engine capacity (histogram + KDE)
sns.histplot(mileage_dataset_clean['engine_capacity'], kde=True, bins=20, color='purple', ax=ax_engine)
ax_engine.set_title('Distribution of Engine Capacity')
ax_engine.set_xlabel('Engine Capacity (cc)')
ax_engine.set_ylabel('Frequency')

fig.tight_layout()
plt.show()
In [20]:
# Distribution of car mileage (histogram with KDE overlay)
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(mileage_dataset_clean['car_mileage'], kde=True, bins=20, color='red', ax=ax)
ax.set_title('Distribution of Car Mileage')
ax.set_xlabel('Mileage (kmpl)')
ax.set_ylabel('Frequency')
plt.show()
In [21]:
# Bar plot of the number of cars per transmission (gearbox) type
plt.figure(figsize=(10, 6))
sns.countplot(x='gearbox_type', data=mileage_dataset_clean, palette='Set2')
plt.title('Transmission Type Distribution')
plt.xlabel('Transmission Type')
plt.ylabel('Count')
# Finish with plt.show() so the cell renders the figure instead of echoing
# the Text(...) object returned by plt.ylabel.
plt.show()
Out[21]:
Text(0, 0.5, 'Count')
In [22]:
# Bivariate analysis: age of vehicle against car mileage
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

sns.set(style="whitegrid")

fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x='age_of_vehicle', y='car_mileage', data=mileage_dataset_clean, color='blue', ax=ax)
ax.set_title('Age of Vehicle vs. Car Mileage')
ax.set_xlabel('Age of Vehicle (years)')
ax.set_ylabel('Car Mileage (kmpl)')
plt.show()

# Pearson correlation: off-diagonal entry of the 2x2 correlation matrix
age_mileage_corr_matrix = np.corrcoef(
    mileage_dataset_clean['age_of_vehicle'],
    mileage_dataset_clean['car_mileage'],
)
correlation_age_mileage = age_mileage_corr_matrix[0, 1]
print(f"Correlation between Age of Vehicle and Car Mileage: {correlation_age_mileage:.2f}")
Correlation between Age of Vehicle and Car Mileage: -0.23
In [23]:
# Kilometers driven against car mileage
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x='kilometers_driven', y='car_mileage', data=mileage_dataset_clean, color='green', ax=ax)
ax.set_title('Kilometers Driven vs. Car Mileage')
ax.set_xlabel('Kilometers Driven')
ax.set_ylabel('Car Mileage (kmpl)')
plt.show()

# Pearson correlation: off-diagonal entry of the 2x2 correlation matrix
km_mileage_corr_matrix = np.corrcoef(
    mileage_dataset_clean['kilometers_driven'],
    mileage_dataset_clean['car_mileage'],
)
correlation_km_mileage = km_mileage_corr_matrix[0, 1]
print(f"Correlation between Kilometers Driven and Car Mileage: {correlation_km_mileage:.2f}")
Correlation between Kilometers Driven and Car Mileage: 0.06
In [24]:
# Compare the mileage distribution across fuel types
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='type_of_fuel', y='car_mileage', data=mileage_dataset_clean, palette='Set2', ax=ax)
ax.set_title('Mileage by Fuel Type')
ax.set_xlabel('Fuel Type')
ax.set_ylabel('Car Mileage (kmpl)')
plt.show()

# Per-fuel-type descriptive statistics of mileage
mileage_by_fuel = mileage_dataset_clean.groupby('type_of_fuel')['car_mileage']
print("\nSummary Statistics for Mileage by Fuel Type:")
print(mileage_by_fuel.describe())
Summary Statistics for Mileage by Fuel Type:
count mean std min 25% 50% 75% max
type_of_fuel
CNG 227.0 23.703392 4.569385 11.88 20.880 26.20 26.60 30.48
Diesel 5269.0 22.305422 3.617784 10.00 19.870 22.54 25.20 28.40
LPG 42.0 17.953571 3.805650 13.45 15.275 17.30 18.70 26.20
Petrol 7331.0 19.287045 2.735209 10.50 17.400 18.90 21.01 28.09
In [25]:
# Engine capacity against car mileage
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x='engine_capacity', y='car_mileage', data=mileage_dataset_clean, color='purple', ax=ax)
ax.set_title('Engine Capacity vs. Car Mileage')
ax.set_xlabel('Engine Capacity (cc)')
ax.set_ylabel('Car Mileage (kmpl)')
plt.show()

# Pearson correlation: off-diagonal entry of the 2x2 correlation matrix
engine_mileage_corr_matrix = np.corrcoef(
    mileage_dataset_clean['engine_capacity'],
    mileage_dataset_clean['car_mileage'],
)
correlation_engine_mileage = engine_mileage_corr_matrix[0, 1]
print(f"Correlation between Engine Capacity and Car Mileage: {correlation_engine_mileage:.2f}")
Correlation between Engine Capacity and Car Mileage: -0.38
In [26]:
# Set up Seaborn style
sns.set(style="whitegrid")

# Pair plot of the numeric Question 1 variables (KDE on the diagonal).
# sns.pairplot creates its own figure, so no plt.figure(...) call is made
# beforehand; doing so only left behind an empty "0 Axes" figure.
pairplot = sns.pairplot(
    mileage_dataset_clean[['age_of_vehicle', 'kilometers_driven', 'car_mileage', 'engine_capacity']],
    diag_kind='kde',
)

# Place the title slightly above the grid (y > 1) so it does not overlap the panels
plt.suptitle('Pairplot of Vehicle Age, Kilometers Driven, Car Mileage, and Engine Capacity',
             y=1.02, fontsize=16)
plt.tight_layout()
plt.show()
<Figure size 1200x1000 with 0 Axes>
In [27]:
# Bar plot of the mean mileage for each fuel type
fuel_mileage_mean = mileage_dataset_clean.groupby('type_of_fuel', as_index=False)['car_mileage'].mean()

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='type_of_fuel', y='car_mileage', data=fuel_mileage_mean, palette='Set2', ax=ax)
ax.set_title('Mean Mileage by Fuel Type')
ax.set_xlabel('Fuel Type')
ax.set_ylabel('Mean Car Mileage (kmpl)')
plt.show()
In [28]:
# Multivariate analysis: pairwise correlations of the numeric variables
numeric_q1_columns = ['age_of_vehicle', 'kilometers_driven', 'engine_capacity', 'car_mileage']
correlation_matrix = mileage_dataset_clean[numeric_q1_columns].corr()

# Annotated heatmap of the correlation matrix
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title('Heatmap of Correlations Between Variables')
plt.show()
In [29]:
# One scatter panel per fuel type: age of vehicle (x) against car mileage (y)
g = sns.FacetGrid(data=mileage_dataset_clean, col="type_of_fuel", height=5, aspect=1)
g.map(sns.scatterplot, "age_of_vehicle", "car_mileage", alpha=0.6).add_legend()

# Axis labels, per-panel titles and an overall title placed above the grid
g.set_axis_labels("Age of Vehicle (years)", "Car Mileage (kmpl)").set_titles(col_template="{col_name} Cars")
plt.suptitle('FacetGrid: Mileage vs. Age of Vehicle by Fuel Type', y=1.02)

plt.tight_layout()
plt.show()
In [30]:
# One scatter panel per fuel type: kilometers driven (x) against car mileage (y)
g = sns.FacetGrid(data=mileage_dataset_clean, col="type_of_fuel", height=5, aspect=1)
g.map(sns.scatterplot, "kilometers_driven", "car_mileage", alpha=0.6).add_legend()

# Axis labels, per-panel titles and an overall title placed above the grid
g.set_axis_labels("Kilometers Driven", "Car Mileage (kmpl)").set_titles(col_template="{col_name} Cars")
plt.suptitle('FacetGrid: Mileage vs. Kilometers Driven by Fuel Type', y=1.02)

plt.tight_layout()
plt.show()
In [31]:
# One scatter panel per fuel type: engine capacity (x) against car mileage (y)
g = sns.FacetGrid(data=mileage_dataset_clean, col="type_of_fuel", height=5, aspect=1)
g.map(sns.scatterplot, "engine_capacity", "car_mileage", alpha=0.6).add_legend()

# Axis labels, per-panel titles and an overall title placed above the grid
g.set_axis_labels("Engine Capacity (cc)", "Car Mileage (kmpl)").set_titles(col_template="{col_name} Cars")
plt.suptitle('FacetGrid: Mileage vs. Engine Capacity by Fuel Type', y=1.02)

plt.tight_layout()
plt.show()
In [32]:
# Strip plot of transmission type vs. car mileage, one panel per fuel type.
# A categorical plot mapped onto a FacetGrid needs an explicit `order`:
# otherwise each facet orders the categories by its own subset of the data,
# and the same x position can mean different gearbox types in different panels.
gearbox_order = sorted(mileage_dataset_clean['gearbox_type'].unique())

g = sns.FacetGrid(mileage_dataset_clean, col="type_of_fuel", height=5, aspect=1)
g.map(sns.stripplot, "gearbox_type", "car_mileage", order=gearbox_order,
      jitter=True, palette="Set2", alpha=0.7)
g.add_legend()
# Add titles and labels
g.set_axis_labels("Transmission Type", "Car Mileage (kmpl)")
g.set_titles(col_template="{col_name} Cars")
plt.suptitle('Stripplot: Transmission Type vs Car Mileage Faceted by Fuel Type', y=1.02)
plt.tight_layout()
plt.show()
Question 2: How does the combination of vehicle characteristics affect the selling price of automatic vs. manual transmission cars?
In [33]:
# Descriptive statistics of the numeric columns after outlier removal (Question 2 dataset)
transmission_dataset_clean.describe()
Out[33]:
| car_mileage | engine_capacity | price | |
|---|---|---|---|
| count | 12395.000000 | 12395.000000 | 1.239500e+04 |
| mean | 20.788276 | 1278.929407 | 5.371584e+05 |
| std | 3.458984 | 241.935832 | 2.418689e+05 |
| min | 10.000000 | 793.000000 | 4.000000e+04 |
| 25% | 18.500000 | 1197.000000 | 3.500000e+05 |
| 50% | 20.510000 | 1248.000000 | 5.000000e+05 |
| 75% | 23.010000 | 1497.000000 | 6.800000e+05 |
| max | 30.480000 | 2148.000000 | 1.280000e+06 |
In [34]:
# Univariate analysis for Question 2: share of each category
import matplotlib.pyplot as plt

# Pie chart of transmission type, with the category names drawn on the wedges
transmission_counts = transmission_dataset_clean['gearbox_type'].value_counts()
fig, ax = plt.subplots(figsize=(8, 6))
ax.pie(
    transmission_counts,
    labels=transmission_counts.index,
    autopct='%1.1f%%',
    startangle=90,
    colors=['#ff9999', '#66b3ff'],
)
ax.set_title('Pie Chart: Transmission Type Distribution')
plt.show()

# Pie chart of fuel type; category names go in a legend rather than on the wedges
fuel_counts = transmission_dataset_clean['type_of_fuel'].value_counts()
fig, ax = plt.subplots(figsize=(8, 6))
ax.pie(
    fuel_counts,
    autopct='%1.1f%%',
    startangle=90,
    colors=['#ff9999', '#66b3ff', '#99ff99', '#ffcc99'],
)
ax.set_title('Pie Chart: Fuel Type Distribution')
ax.legend(labels=fuel_counts.index, loc="best")
fig.tight_layout()
plt.show()
In [35]:
# Distribution of engine capacity in five bins, with KDE overlay
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(transmission_dataset_clean['engine_capacity'], kde=True, bins=5, color='green', ax=ax)
ax.set_title('Histogram: Engine Capacity Distribution')
ax.set_xlabel('Engine Capacity (cc)')
ax.set_ylabel('Frequency')
plt.show()
In [36]:
# Kernel density estimate of the selling price
fig, ax = plt.subplots(figsize=(8, 6))
sns.kdeplot(transmission_dataset_clean['price'], fill=True, color='purple', ax=ax)
ax.set_title('Density Plot: Selling Price Distribution')
ax.set_xlabel('Price (INR)')
ax.set_ylabel('Density')
plt.show()
In [37]:
# Density plot for car mileage
plt.figure(figsize=(8, 6))
sns.kdeplot(transmission_dataset_clean['car_mileage'], fill=True, color='purple')
plt.title('Density Plot: Car Mileage Distribution')
# Every other plot of car_mileage in this notebook labels the unit as kmpl,
# so the axis label uses kmpl as well (it previously said "Km").
plt.xlabel('Car Mileage (kmpl)')
plt.ylabel('Density')
plt.show()
In [38]:
# Bivariate analysis: selling price distribution, split by transmission type
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    data=transmission_dataset_clean,
    x='price',
    hue='gearbox_type',
    kde=True,
    bins=30,
    palette="Set2",
    ax=ax,
)
ax.set_title('Histogram: Selling Price Distribution by Transmission Type')
ax.set_xlabel('Selling Price (INR)')
ax.set_ylabel('Frequency')
plt.show()
In [39]:
# Engine capacity distribution, split by transmission type
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    data=transmission_dataset_clean,
    x='engine_capacity',
    hue='gearbox_type',
    kde=True,
    bins=30,
    palette="Set2",
    ax=ax,
)
ax.set_title('Histogram: Engine Capacity Distribution by Transmission Type')
ax.set_xlabel('Engine Capacity (cc)')
ax.set_ylabel('Frequency')
plt.show()
In [40]:
# Scatter plot of car mileage vs. price, coloured by gearbox type.
# hue='gearbox_type' is what makes the palette take effect and separates
# automatic from manual cars, which is the comparison Question 2 asks for.
plt.figure(figsize=(10, 6))
sns.scatterplot(x='car_mileage', y='price', hue='gearbox_type',
                data=transmission_dataset_clean, palette='Set2', s=100)
plt.title('Scatter Plot: Car Mileage vs. Price by Gearbox Type')
plt.xlabel('Car Mileage (kmpl)')
plt.ylabel('Price (INR)')
plt.show()
In [41]:
# Scatter plot of gearbox type (x, categorical) against car mileage (y)
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x='gearbox_type', y='car_mileage', data=transmission_dataset_clean, palette='Set2', s=100, ax=ax)
ax.set_title('Scatter Plot: GearBox Type vs. Car Mileage')
ax.set_xlabel('GearBox Type')
ax.set_ylabel('Car Mileage')
plt.show()
In [42]:
# Box plot of price for each fuel type (not split by gearbox type here)
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='type_of_fuel', y='price', data=transmission_dataset_clean, palette='Set2', ax=ax)
ax.set_title('Box Plot: Fuel Type vs. Price ')
ax.set_xlabel('Fuel Type')
ax.set_ylabel('Price (INR)')
plt.show()
In [43]:
# Gearbox type vs. fuel type.
# Both variables are categorical, so a scatter plot only marks which
# combinations exist (and the old title called it a box plot). A heatmap of
# the cross-tabulated counts shows how many cars fall in each combination.
gearbox_fuel_counts = pd.crosstab(
    transmission_dataset_clean['type_of_fuel'],
    transmission_dataset_clean['gearbox_type'],
)
plt.figure(figsize=(10, 6))
sns.heatmap(gearbox_fuel_counts, annot=True, fmt='d', cmap='Blues', linewidths=0.5)
plt.title('Heatmap: Gearbox Type vs. Fuel Type (number of cars)')
plt.xlabel('Gearbox Type')
plt.ylabel('Fuel Type')
plt.show()
In [44]:
# Scatter plot of engine capacity vs. price, coloured by gearbox type.
# hue='gearbox_type' is what makes the palette take effect and separates
# automatic from manual cars, which is the comparison Question 2 asks for.
plt.figure(figsize=(10, 6))
sns.scatterplot(x='engine_capacity', y='price', hue='gearbox_type',
                data=transmission_dataset_clean, palette='Set2', s=100)
plt.title('Scatter Plot: Engine Capacity vs. Price by Gearbox Type')
plt.xlabel('Engine Capacity (cc)')
plt.ylabel('Price (INR)')
plt.show()
In [45]:
# Scatter plot of gearbox type (x, categorical) against engine capacity (y)
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x='gearbox_type', y='engine_capacity', data=transmission_dataset_clean, palette='Set2', ax=ax)
ax.set_title('Scatter Plot: Gearbox Type vs. Engine Capacity ')
ax.set_xlabel('Gearbox Type')
ax.set_ylabel('Engine Capacity')
plt.show()
In [46]:
# Violin plot of price per fuel type, split by gearbox type.
# split=True draws the two hue levels as the two halves of each violin, so it
# needs a two-level hue variable; gearbox_type (Manual / Automatic) is the
# comparison this question is about.
plt.figure(figsize=(10, 6))
sns.violinplot(x='type_of_fuel', y='price', hue='gearbox_type',
               data=transmission_dataset_clean, split=True, palette='Set2')
plt.title('Violin Plot: Fuel Type vs. Price by Gearbox Type')
plt.xlabel('Fuel Type')
plt.ylabel('Price (INR)')
plt.show()
In [47]:
# Number of cars per gearbox type, with one bar per fuel type
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='gearbox_type', hue='type_of_fuel', data=transmission_dataset_clean, palette='Set2', ax=ax)
ax.set_title('Count Plot: Gearbox Type by Fuel Type')
ax.set_xlabel('Gearbox Type')
ax.set_ylabel('Count')
plt.show()
In [48]:
# Multivariate analysis: pair plot of the numeric columns, coloured by
# gearbox type, with KDE curves on the diagonal
sns.pairplot(
    data=transmission_dataset_clean,
    hue='gearbox_type',
    diag_kind='kde',
    palette='Set2',
    height=2.5,
)
# y > 1 places the title just above the grid
plt.suptitle('Pair Plot: Car Mileage, Engine Capacity, and Price by Gearbox Type', y=1.02)
plt.show()
In [49]:
# Reminder of the columns available in the Question 2 dataset
transmission_dataset_clean.head()
Out[49]:
| type_of_fuel | gearbox_type | car_mileage | engine_capacity | price | |
|---|---|---|---|---|---|
| 0 | Petrol | Manual | 19.70 | 796 | 120000 |
| 1 | Petrol | Manual | 18.90 | 1197 | 550000 |
| 2 | Petrol | Manual | 17.00 | 1197 | 215000 |
| 3 | Petrol | Manual | 20.92 | 998 | 226000 |
| 4 | Diesel | Manual | 22.77 | 1498 | 570000 |
In [50]:
# Price by fuel type, with separate boxes for each gearbox type
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(
    x='type_of_fuel',
    y='price',
    hue='gearbox_type',
    data=transmission_dataset_clean,
    palette='Set2',
    ax=ax,
)
ax.set_title('Box Plot: Fuel Type vs. Price by Gearbox Type')
ax.set_xlabel('Fuel Type')
ax.set_ylabel('Price (INR)')
plt.show()
In [51]:
# One scatter panel per fuel type: car mileage (x) against price (y),
# coloured by gearbox type
g = sns.FacetGrid(data=transmission_dataset_clean, col="type_of_fuel", hue="gearbox_type", height=5, aspect=1.2)
g.map(sns.scatterplot, "car_mileage", "price", s=100, alpha=0.7).add_legend()

# Axis labels, per-panel titles and an overall title placed above the grid
g.set_axis_labels("Car Mileage (kmpl)", "Price (INR)").set_titles(col_template="{col_name} Cars")
plt.suptitle('FacetGrid: Price vs. Car Mileage by Fuel Type and Gearbox Type', y=1.02)

plt.tight_layout()
plt.show()
In [52]:
# Correlation heatmap of the continuous Question 2 variables
continuous_q2_columns = ['car_mileage', 'engine_capacity', 'price']
corr_matrix = transmission_dataset_clean[continuous_q2_columns].corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", linewidths=0.5, ax=ax)
ax.set_title('Heatmap of Correlation Matrix')
plt.show()
Question 3 : How does fuel type (Petrol vs. Diesel) influence car performance in terms of mileage and price?
In [53]:
# Descriptive statistics of the numeric columns after outlier removal (Question 3 dataset)
fuel_performance_dataset_clean.describe()
Out[53]:
| car_mileage | price | |
|---|---|---|
| count | 13948.000000 | 1.394800e+04 |
| mean | 20.072265 | 5.747825e+05 |
| std | 3.899249 | 2.785681e+05 |
| min | 9.000000 | 4.000000e+04 |
| 25% | 17.400000 | 3.700000e+05 |
| 50% | 20.000000 | 5.300000e+05 |
| 75% | 22.770000 | 7.250000e+05 |
| max | 30.480000 | 1.485000e+06 |
In [54]:
# Univariate analysis for Question 3: share of each fuel type
fuel_counts = fuel_performance_dataset_clean['type_of_fuel'].value_counts()

# The wedges carry only percentages; the category names go in a legend
fig, ax = plt.subplots(figsize=(8, 6))
ax.pie(
    fuel_counts,
    autopct='%1.1f%%',
    startangle=90,
    colors=['#ff9999', '#66b3ff', '#99ff99', '#ffcc99'],
)
ax.set_title('Pie Chart: Fuel Type Distribution')
ax.legend(labels=fuel_counts.index, loc="best")
fig.tight_layout()
plt.show()
In [55]:
# Distribution of car mileage in five bins, with KDE overlay
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(fuel_performance_dataset_clean['car_mileage'], kde=True, bins=5, color='green', ax=ax)
ax.set_title('Histogram: Car Mileage Distribution')
ax.set_xlabel('Car Mileage (kmpl)')
ax.set_ylabel('Frequency')
plt.show()
In [56]:
# Kernel density estimate of price
fig, ax = plt.subplots(figsize=(8, 6))
sns.kdeplot(fuel_performance_dataset_clean['price'], fill=True, color='purple', ax=ax)
ax.set_title('Density Plot: Price Distribution')
ax.set_xlabel('Price (INR)')
ax.set_ylabel('Density')
plt.show()
In [57]:
# Bivariate Analysis
# Box Plot for Car Mileage by Fuel Type: one box per fuel type.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(
    data=fuel_performance_dataset_clean,
    x='type_of_fuel',
    y='car_mileage',
    palette="Set2",
    ax=ax,
)
ax.set(
    title='Box Plot: Car Mileage by Fuel Type',
    xlabel='Fuel Type',
    ylabel='Car Mileage (kmpl)',
)
plt.show()
In [58]:
# Violin Plot for Price by Fuel Type: price distribution per fuel type.
plt.figure(figsize=(8, 6))
ax = sns.violinplot(
    data=fuel_performance_dataset_clean,
    x='type_of_fuel',
    y='price',
    palette="Set2",
)
# sns.violinplot returns the Axes it drew on, so label that Axes directly.
ax.set_title('Violin Plot: Price by Fuel Type')
ax.set_xlabel('Fuel Type')
ax.set_ylabel('Price (INR)')
plt.show()
In [59]:
# Multivariate Analysis
# Pairplot of the fuel-performance dataset, coloured by fuel type, with KDE
# curves on the diagonal.
sns.pairplot(
    data=fuel_performance_dataset_clean,
    hue='type_of_fuel',
    height=3,
    diag_kind='kde',
)
# y > 1 places the overall title above the grid of subplots.
plt.suptitle(
    'Pairplot: Fuel Type, Car Mileage, and Price',
    y=1.02,
)
plt.show()
In [60]:
# FacetGrid: Car Mileage vs Price by Fuel Type (one scatter panel per fuel type).
g = sns.FacetGrid(
    data=fuel_performance_dataset_clean,
    col="type_of_fuel",
    height=5,
    aspect=1.2,
)
# The FacetGrid methods used here return the grid itself, so the drawing and
# labelling steps are chained in their original order.
(
    g.map(sns.scatterplot, "car_mileage", "price", s=100, alpha=0.7)
    .add_legend()
    .set_axis_labels("Car Mileage (kmpl)", "Price (INR)")
    .set_titles(col_template="{col_name} Cars")
)
# y > 1 places the overall title above the row of panels.
plt.suptitle('FacetGrid: Price vs Car Mileage by Fuel Type', y=1.02)
plt.tight_layout()
plt.show()
In [61]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

# PCA of car mileage and price, with every point coloured by its fuel type.

# Standardize the data (scaling the features so they have unit variance and
# mean of 0); price is several orders of magnitude larger than mileage and
# would otherwise dominate the components.
scaler = StandardScaler()
df_scaled = scaler.fit_transform(fuel_performance_dataset_clean[['car_mileage', 'price']])

# Apply PCA, keeping both components
pca = PCA(n_components=2)
pca_result = pca.fit_transform(df_scaled)

# Create a color map for fuel type. A fuel type that is not listed in the
# palette falls back to gray instead of becoming a NaN entry in the colour
# sequence handed to plt.scatter.
fuel_palette = {'Petrol': 'blue', 'Diesel': 'green', 'LPG': 'purple', 'CNG': 'red'}
fuel_types = fuel_performance_dataset_clean['type_of_fuel'].astype(str)
color_map = fuel_types.map(lambda fuel: fuel_palette.get(fuel, 'gray'))

# Plotting the PCA results
plt.figure(figsize=(8, 6))
plt.scatter(pca_result[:, 0], pca_result[:, 1], c=color_map.tolist(), s=100, alpha=0.7)

# A single scatter call carries no per-category labels, so build proxy handles
# to give the plot a legend that explains the colours.
legend_handles = [
    Line2D([], [], marker='o', linestyle='', markersize=10,
           color=fuel_palette.get(fuel, 'gray'), alpha=0.7, label=fuel)
    for fuel in fuel_types.unique()
]
plt.legend(handles=legend_handles, title='Fuel Type', loc='upper right')

plt.title('PCA: Car Mileage and Price by Fuel Type')
plt.xlabel(f'Principal Component 1 ({pca.explained_variance_ratio_[0]*100:.2f}% Variance)')
plt.ylabel(f'Principal Component 2 ({pca.explained_variance_ratio_[1]*100:.2f}% Variance)')
plt.grid(True)
plt.show()

# Print the explained variance ratio
explained_variance = pca.explained_variance_ratio_
print(f"Explained variance by each component: {explained_variance}")
Explained variance by each component: [0.59871723 0.40128277]
In [ ]: